In [1]:
%%bash
ls | grep '\.csv$'
In [2]:
# built-in libs
import email
# processing libs
import pandas as pd
# display libs
from tqdm import tqdm_notebook
In [3]:
emails_full_df = pd.read_csv('emails.csv', chunksize=10000)
emails_df = next(emails_full_df)
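Note: with `chunksize`, `read_csv` returns a `TextFileReader` iterator rather than a DataFrame, so the `next()` call above loads only the first 10,000 rows. A minimal sketch (illustrative, not run here) of streaming the whole file instead:
In [ ]:
# Sketch: stream the full CSV in 10k-row chunks and count rows.
total_rows = 0
for chunk in pd.read_csv('emails.csv', chunksize=10000):
    total_rows += len(chunk)
print('total rows: %i' % total_rows)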
In [4]:
print(emails_df.shape)
emails_df.head()
Out[4]:
In [5]:
emails_df.info()
In [6]:
%%time
messages_obj_lst = []
messages_str_lst = []
message_metadata = {}
for i in tqdm_notebook(range(emails_df.shape[0])):
    msg = email.message_from_string(emails_df.message[i])
    # collect every header into per-column lists, padding missing rows with 'N/A'
    for msg_property in msg.keys():
        if msg_property not in message_metadata:
            message_metadata[msg_property] = ['N/A'] * emails_df.shape[0]
        message_metadata[msg_property][i] = msg[msg_property]
    payload = msg.get_payload()  # decode=True
    messages_obj_lst.append(msg)
    messages_str_lst.append(payload)  # .encode('utf-8').decode('unicode_escape')
print('messages_obj_lst size: %i' % len(messages_obj_lst))
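The per-header lists gathered into `message_metadata` can be materialized as their own DataFrame; a minimal sketch (assuming each list holds one entry per row, as built above):
In [ ]:
# Sketch: one column per email header, row-aligned with emails_df.
headers_df = pd.DataFrame(message_metadata)
print(headers_df.shape)
headers_df.head()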
In [7]:
# attach the parsed Message objects and extracted payloads to the dataframe
emails_df = emails_df.assign(message_obj=pd.Series(messages_obj_lst).values)
emails_df = emails_df.assign(payload=pd.Series(messages_str_lst).values)
# collapse newlines in the payload so each body is a single line of text
emails_df['payload'] = emails_df.payload.str.replace(r'\n', ' ', regex=True)
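Caveat: `get_payload()` returns a list, not a string, when a message is multipart. The Enron dump is overwhelmingly plain text, but a hedged sketch that flattens multipart messages with `Message.walk()` would look like this (`flatten_payload` is an illustrative helper, not used above):
In [ ]:
def flatten_payload(msg):
    """Return payload text, joining the text/plain parts of multipart messages."""
    if msg.is_multipart():
        parts = [p.get_payload() for p in msg.walk()
                 if p.get_content_type() == 'text/plain']
        return '\n'.join(parts)
    return msg.get_payload()

flatten_payload(emails_df.message_obj[0])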
In [8]:
emails_df.head()
Out[8]:
In [9]:
for i in range(50):
    print(emails_df.message_obj[i]['Subject'])
In [10]:
del messages_obj_lst
del messages_str_lst
emails_df.drop('message', axis=1, inplace=True)
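To confirm that dropping the raw `message` column actually freed memory, pandas can report true object sizes:
In [ ]:
# memory_usage='deep' measures the actual size of object columns
emails_df.info(memory_usage='deep')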
In [11]:
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
In [12]:
train = emails_df[:7000]
test = emails_df[7000:]
In [13]:
trainheadlines = []
for row in range(0, len(train.index)):
    trainheadlines.append(train.message_obj[row]['Subject'])
# drop emails with no Subject header
trainheadlines = list(filter(None, trainheadlines))
trainheadlines[:10]
Out[13]:
In [14]:
# trainvect = CountVectorizer()
# Trainfeature = trainvect.fit_transform(trainheadlines)
In [15]:
# Detailed view of the document-term matrix
# DTM_With_Colm = pd.DataFrame(Trainfeature.toarray(), columns=trainvect.get_feature_names())
In [16]:
# Trainfeature.shape
In [17]:
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
from nltk.stem.porter import PorterStemmer
from gensim import corpora, models
import gensim
In [18]:
%%time
tokenizer = RegexpTokenizer(r'\w+')
# create English stop words list
en_stop = get_stop_words('en')
# create p_stemmer of class PorterStemmer
p_stemmer = PorterStemmer()
# list for tokenized documents in loop
texts = []
# loop through the subject lines
for headline in trainheadlines:
    # clean and tokenize document string
    raw = headline.lower()
    tokens = tokenizer.tokenize(raw)
    # remove stop words from tokens
    stopped_tokens = [tok for tok in tokens if tok not in en_stop]
    # stem tokens
    stemmed_tokens = [p_stemmer.stem(tok) for tok in stopped_tokens]
    # add tokens to list
    texts.append(stemmed_tokens)
# turn our tokenized documents into an id <-> term dictionary
dictionary = corpora.Dictionary(texts)
# convert tokenized documents into a document-term matrix (bag-of-words)
corpus = [dictionary.doc2bow(text) for text in texts]
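An optional refinement (not applied above): gensim's `Dictionary.filter_extremes` prunes very rare and very common tokens before the bag-of-words step, which often yields cleaner topics. The thresholds below are illustrative:
In [ ]:
# Illustrative: drop tokens in fewer than 5 headlines or in more than half of them,
# then rebuild the corpus against the pruned dictionary.
dictionary.filter_extremes(no_below=5, no_above=0.5)
corpus = [dictionary.doc2bow(text) for text in texts]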
In [19]:
%%time
# generate the LDA model
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=10, id2word=dictionary,
                                           passes=1, chunksize=10000, update_every=1)
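To sanity-check the fit, a single headline can be scored against the trained model; a minimal sketch:
In [ ]:
# Topic mixture of the first headline: list of (topic_id, probability) pairs.
bow = dictionary.doc2bow(texts[0])
print(trainheadlines[0])
print(ldamodel.get_document_topics(bow))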
In [20]:
%%time
print(ldamodel.print_topics(num_topics=10, num_words=3))
In [21]:
ldamodel.print_topics(5)
Out[21]:
In [22]:
import pyLDAvis.gensim
pyLDAvis.enable_notebook()
news = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)
In [23]:
news
Out[23]:
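The interactive view above only renders inside a live notebook; `pyLDAvis.save_html` writes it out as a standalone page (the filename is illustrative):
In [ ]:
pyLDAvis.save_html(news, 'enron_lda_topics.html')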
In [24]:
# %%bash
# nvidia-smi